Loading and arranging the data

Setup:

knitr::opts_chunk$set(echo = TRUE, max.print = 100)

Load libraries:

library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringi)
library(Cairo)
library(ape)
library(geosphere)
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(ggrepel)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:ape':
## 
##     rotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Load data:

Phonotacticon <- read_xlsx("Phonotacticon.xlsx", guess_max = 1661)
PhonoBib <- read_xlsx("PhonoBib.xlsx")
PanPhon <- read_csv("PanPhonPhonotacticon1_0.csv") %>% 
  filter(!duplicated(ipa))
## Rows: 7416 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ipa
## dbl (22): syl, son, cons, cont, delrel, lat, nas, strid, voi, sg, cg, ant, c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Phonotacticon
PhonoBib
PanPhon

Create PanPhon regular expression:

PanPhonOrder <- PanPhon$ipa[order(-nchar(PanPhon$ipa), PanPhon$ipa)]
PanPhonRegex <- paste0("(?:", paste(PanPhonOrder, collapse="|"), ")")

str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."

Subset Eurasian lects that are complete in Phonotacticon:

Eurasia <- Phonotacticon %>% 
  filter(Macroarea == 'Eurasia',
         Complete,
         complete.cases(P),
         O != "?",
         N != "?",
         C != "?",
         !grepl("[A-Z]|\\[", O),
         !grepl("[A-Z]|\\[", N),
         !grepl("[A-Z]|\\[", C))

Extract onset, nucleus, and coda sequences:

Sequences <- Eurasia %>% 
  select(Lect, O, N, C) %>% 
  pivot_longer(-Lect, names_to = 'Category', values_to = 'Sequence') %>% 
  separate(col = Sequence, 
           sep = ' ', 
           into = as.character(1:500),
           fill = 'right') %>% 
  pivot_longer(-c(Lect, Category), names_to = 'Number', values_to = 'Sequence') %>% 
  select(-Number) %>% 
  filter(!is.na(Sequence)) %>% 
  distinct()

Sequences

Split the sequences into segments:

Segments <- stri_extract_all_regex(Sequences$Sequence, 
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>% 
  as_tibble(.name_repair = 'unique') %>% 
  mutate(Lect = Sequences$Lect,
         Category = Sequences$Category,
         Sequence = Sequences$Sequence) %>% 
  pivot_longer(cols = -c(Lect, Category, Sequence),
               names_to = 'Order',
               values_to = 'ipa') %>% 
  mutate(Order = Order %>% 
           as.factor() %>% 
           as.integer()) %>% 
  filter(ipa != "")
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Segments

Measure the length of each sequence:

Sequences_length <- Segments %>% 
  count(Lect, Category, Sequence,
        name = 'Length')

Sequences_length

Join the length of each sequence to segments:

Segments <- left_join(Segments, Sequences_length)
## Joining, by = c("Lect", "Category", "Sequence")
Segments

Count the maximal length:

MaxLength <- max(Sequences_length$Length)

MaxLength
## [1] 5

Count the number of split segments:

Segments_number <- nrow(Segments)

Segments_number
## [1] 8697

Assign different positions to each sequence:

Sequences_rep <- bind_rows(rep(list(Segments), 5))

Sequences_rep <- Sequences_rep %>% 
  mutate(Position = rep(0:(MaxLength - 1),
                        each = Segments_number)) %>% 
  mutate(Order = Order + Position) %>% 
  filter(Length + Position <= MaxLength) %>% 
  select(-Length)

Sequences_rep

Calculating the phonological distance

Join segments with their phonological features:

Sequences_features <- Sequences_rep %>% 
  left_join(PanPhon, by = 'ipa') %>% 
  pivot_longer(cols = -c(Lect, 
                         Category, 
                         Sequence, 
                         Order, 
                         ipa, 
                         Position),
               names_to = 'Feature',
               values_to = 'Value') %>% 
  mutate(Feature = paste0(Feature, Order)) %>% 
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>% 
  select(-Order, -ipa) %>% 
  pivot_longer(cols = -c(Lect, Category, Sequence, Position),
               names_to = 'Feature',
               values_to = 'Value') %>% 
  filter(Value != 'NULL') %>% 
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>% 
  replace(is.na(.), 0)

Sequences_features

Paste sequence and position together:

Sequences_SeqPos <- Sequences_features %>%
  mutate(SequencePosition = paste0(Sequence, Position)) %>% 
  select(-Lect, -Category, -Position, -Sequence) %>% 
  select(SequencePosition, everything()) %>% 
  distinct()

Sequences_SeqPos

Calculate the distance between segments:

Sequences_distance <- Sequences_SeqPos %>% 
  select(-SequencePosition) %>%  
  dist(method = 'euclidean') %>% 
  as.matrix() %>%
  as_tibble(.name_repair = 'unique')

Sequences_distance <- bind_cols(Sequences_SeqPos$SequencePosition,
                                Sequences_distance)
## New names:
## • `` -> `...1`
colnames(Sequences_distance) <- c('SequencePosition',
                                 Sequences_SeqPos$SequencePosition)

Sequences_distance

Find the minimal distance among the different positions:

Sequences_MinDistance <- Sequences_distance %>% 
  pivot_longer(-SequencePosition, 
               names_to = 'Sequence.y', 
               values_to = 'Distance') %>% 
  mutate(Sequence.x = str_remove(SequencePosition, '[0-9]')) %>% 
  select(-SequencePosition) %>% 
  mutate(Sequence.y = str_remove(Sequence.y, '[0-9]')) %>% 
  group_by(Sequence.x, Sequence.y) %>% 
  summarise(Distance = min(Distance))
## `summarise()` has grouped output by 'Sequence.x'. You can override using the
## `.groups` argument.
Sequences_MinDistance

Subset the distance of /pl/ and other segments:

pl <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'pl',
         Sequence.y != '∅') %>% 
  arrange(Distance)

pl

Subset the distance of /ia/ and other segments:

ia <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'ia',
         Sequence.y != '∅') %>% 
  arrange(Distance)

ia

Compare the difference between two lects within the same category:

ONC_distance <- Sequences %>% 
  left_join(Sequences, by = c('Category')) %>%
  left_join(Sequences_MinDistance, 
            by = c('Sequence.x', 'Sequence.y')) %>% 
  group_by(Lect.x, Category, Sequence.x, Lect.y) %>% 
  summarize(Distance = min(Distance)) %>% 
  group_by(Lect.x, Category, Lect.y) %>% 
  summarize(Distance = mean(Distance)) %>% 
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                             "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " ")) %>% 
  group_by(Lect_vs_Lect, Category) %>% 
  mutate(Distance = max(Distance)) %>% 
  ungroup() %>% 
  select(Lect_vs_Lect, Category, Distance) %>% 
  distinct()
## `summarise()` has grouped output by 'Lect.x', 'Category', 'Sequence.x'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Lect.x', 'Category'. You can override
## using the `.groups` argument.
ONC_distance

Count the number of tones in each lect:

Tones <- Eurasia %>%
  select(Lect, `T`) %>%
  mutate(`T` = gsub("\\-", NA, `T`)) %>%
  mutate(`T` = str_count(`T`, " ") + 1)

Tones[is.na(Tones$`T`),]$`T` <- 0

Tones

Calculate the Canberra distance between the numbers of tones of each pair of lects:

Tones_distance <- dist(Tones, method = 'canberra')
## Warning in dist(Tones, method = "canberra"): NAs introduced by coercion
Tones_distance[is.na(Tones_distance)] <- 0
Tones_distance <- Tones_distance %>%
    as.matrix() %>%
    as_tibble(.name_repair = 'unique')

Tones_distance <- bind_cols(Tones$Lect, Tones_distance)
## New names:
## • `` -> `...1`
colnames(Tones_distance) <- c('Lect', Tones$Lect)

Tones_distance <- Tones_distance %>%
    pivot_longer(-Lect, names_to = 'Lect2', values_to = 'Distance') %>%
    mutate(Lect_vs_Lect = str_c(pmin(Lect, Lect2),
                                "vs.",
                                pmax(Lect, Lect2),
                               sep = " "),
           Category = 'T') %>%
    select(Lect_vs_Lect, Category, Distance) %>%
    distinct()

Tones_distance

Bind segmental and tonal distances:

ONCT_distance <- rbind(ONC_distance,
                        Tones_distance) %>%
                  pivot_wider(names_from = Category,
                  values_from = Distance)

ONCT_distance

Normalize the distances:

ONCT_distance[, c('O', 'N', 'C', 'T')] <-
      scale(ONCT_distance[, c('O', 'N', 'C', 'T')])

ONCT_distance

Calculate the mean of the distances and obtain the phonological distances:

PhonoDist <- ONCT_distance %>%
  mutate(Distance = 
               rowMeans(ONCT_distance[, c('O', 'N', 'C', 'T')])) %>%
  mutate(Distance = Distance - min(Distance)) %>% 
  select(Lect_vs_Lect, Distance)

PhonoDist

Split the Lect_vs_Lect column of PhonoDist into Lect.x and Lect.y:

Lect_vs_Lect <- str_split_fixed(
  PhonoDist$Lect_vs_Lect, ' vs. ', n = 2) %>% 
  as_tibble(.name_repair = 'unique')
## New names:
## • `` -> `...1`
## • `` -> `...2`
colnames(Lect_vs_Lect) <- c('Lect.x', 'Lect.y')

Lect_vs_Lect

Visualize the phonological distances via multidimensional scaling:

PhonoScale <- PhonoDist %>% 
  bind_cols(Lect_vs_Lect) %>% 
  select(-Lect_vs_Lect) %>% 
  pivot_wider(names_from = Lect.y,
              values_from = Distance) %>% 
  select(-Lect.x) %>% 
  t() %>% 
  as.dist() %>% 
  cmdscale(k = 3) %>% 
  as.data.frame() %>% 
  rownames_to_column('Lect') %>% 
  as_tibble()
  
PhonoScale

Use the average silhouette method:

Silhouette <- PhonoScale %>% 
  select(-Lect) %>% 
  fviz_nbclust(kmeans, method = 'silhouette')

Silhouette

Find the optimal number of clusters:

ClusterNumber <- Silhouette$data %>% 
  as_tibble %>% 
  filter(y == max(y)) %>% 
  select(-y) %>% 
  as.numeric()

ClusterNumber
## [1] 2

Cluster PhonoScale into groups:

K <- PhonoScale %>% 
  select(-Lect) %>% 
  kmeans(ClusterNumber)

PhonoScale$group <- as.factor(K$cluster)

PhonoScale

Visualize the multidimensional scaling:

PhonoScalePlot <- ggplot(PhonoScale,
                         aes(x = V2, 
                             y = V1,
                             fill = V3,
                             shape = group,
                             label = Lect)) +
                  geom_point(show.legend = FALSE) +
                  geom_label_repel(size = 2,
                    min.segment.length = 0,
                    max.overlaps = 100,
                    force_pull = 100) +
  scale_shape_manual(values = 1:ClusterNumber) +
  scale_fill_gradient(low = 'white', high = 'red') +
  theme_classic() +
  coord_fixed()

cairo_pdf(file = 'PhonoScalePlot.pdf', 
          width = 7, 
          height = 8, 
          family = 'Times New Roman')
PhonoScalePlot
dev.off()
## quartz_off_screen 
##                 2
PhonoScalePlot

3D multidimensional scaling:

PhonoScale3D <- plot_ly(PhonoScale,
        x = ~V1,
        y = ~V2,
        z = ~V3,
        type = 'scatter3d',
        mode = 'text',
        text = ~Lect,
        color  = ~group,
        textfont = list(size = 10)) %>% 
  layout(showlegend = FALSE)

PhonoScale3D
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Testing the correlation between phonological and geographical distances

Calculate the geographical distances between each pair of lects:

Lect_LonLat <- PhonoBib %>% 
  filter(Lect %in% Eurasia$Lect) %>% 
  select(Lect, lon, lat)

Lect_Kilometers <- Lect_LonLat %>% 
  mutate(Kilometers = 'Kilometers')

Coordinates <- Lect_Kilometers %>% 
  full_join(Lect_Kilometers, by = 'Kilometers')

Coordinates <- Coordinates %>% 
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                             "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " "))

Coordinates.x <- select(Coordinates, lon.x, lat.x)
Coordinates.y <- select(Coordinates, lon.y, lat.y)

GeoDist <- Coordinates %>% 
  mutate(Kilometers = 
           distHaversine(Coordinates.x, Coordinates.y) / 1000) %>% 
  select(Lect_vs_Lect, Kilometers) %>% 
  distinct()

GeoDist

Join the phonological distances and geographical distances:

PhonoGeoDist <- full_join(PhonoDist, GeoDist, by = 'Lect_vs_Lect')

PhonoGeoDist

Filter out the distance of a lect from itself:

PhonoGeoDist <- PhonoGeoDist %>% 
  filter(!(grepl('(.*) vs\\. \\1', Lect_vs_Lect)))

PhonoGeoDist

Linear regression between geographical distance and phonological distances:

PhonoGeoDist %>% 
  lm(formula = Distance ~ Kilometers) %>% 
  summary()
## 
## Call:
## lm(formula = Distance ~ Kilometers, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.73224 -0.37176  0.01898  0.36920  1.51790 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.827e+00  1.812e-02  100.85   <2e-16 ***
## Kilometers  8.702e-05  4.401e-06   19.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.516 on 2341 degrees of freedom
## Multiple R-squared:  0.1431, Adjusted R-squared:  0.1428 
## F-statistic:   391 on 1 and 2341 DF,  p-value: < 2.2e-16
PhonoGeoDist

Visualize the linear regression:

PhonoLM <- 
  ggplot(aes(x = Kilometers, y = Distance), data = PhonoGeoDist) +
  geom_point(alpha = 0.1) +
  geom_smooth(formula = y ~ x, method = 'lm', color = 'red') +
  theme_classic() +
  scale_x_continuous(name = 'Geographical distance (km)') +
  scale_y_continuous(name = 'Phonological distance (z)')

cairo_pdf(file = 'PhonoLM.pdf', 
          width = 4, 
          height = 3, 
          family = 'Times New Roman')
PhonoLM
dev.off()
## quartz_off_screen 
##                 2
PhonoLM